import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import sklearn
import torch
# sklearn
from sklearn import model_selection # train/test split
from sklearn import ensemble # RF, GBM
from sklearn import metrics
# embedding
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder
def build_graph_bipartite(df_input, graph_type=nx.Graph()):
    df = df_input.copy()
    mapping = {x: node_id for node_id, x in enumerate(set(df["cc_num"].values.tolist() +
                                                          df["merchant"].values.tolist()))}
    df["from"] = df["cc_num"].apply(lambda x: mapping[x])    # edge source (card node)
    df["to"] = df["merchant"].apply(lambda x: mapping[x])    # edge target (merchant node)
    df = df[['from', 'to', "amt", "is_fraud"]].groupby(['from', 'to']).agg({"is_fraud": "sum", "amt": "sum"}).reset_index()
    df["is_fraud"] = df["is_fraud"].apply(lambda x: 1 if x > 0 else 0)
    G = nx.from_edgelist(df[["from", "to"]].values, create_using=graph_type)
    nx.set_edge_attributes(G, {(int(x["from"]), int(x["to"])): x["is_fraud"] for idx, x in df[["from", "to", "is_fraud"]].iterrows()}, "label")   # edge attribute: fraud label of each edge
    nx.set_edge_attributes(G, {(int(x["from"]), int(x["to"])): x["amt"] for idx, x in df[["from", "to", "amt"]].iterrows()}, "weight")  # edge attribute: transaction amount of each edge
    return G
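build_graph_bipartite maps every card and merchant to an integer node id and collapses repeated (card, merchant) pairs into a single edge carrying the summed amount. A minimal sanity check on a hypothetical toy frame (the column names follow the dataset used below):

_toy = pd.DataFrame({"cc_num": [1111, 1111, 2222],
                     "merchant": ["A", "B", "A"],
                     "amt": [10.0, 20.0, 30.0],
                     "is_fraud": [0, 1, 0]})
_G = build_graph_bipartite(_toy)
len(_G.nodes), len(_G.edges)  # 4 nodes (2 cards + 2 merchants), 3 edges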
def build_graph_tripartite(df_input, graph_type=nx.Graph()):
    df = df_input.copy()
    mapping = {x: node_id for node_id, x in enumerate(set(df.index.values.tolist() +
                                                          df["cc_num"].values.tolist() +
                                                          df["merchant"].values.tolist()))}
    df["in_node"] = df["cc_num"].apply(lambda x: mapping[x])
    df["out_node"] = df["merchant"].apply(lambda x: mapping[x])
    G = nx.from_edgelist([(x["in_node"], mapping[idx]) for idx, x in df.iterrows()] +
                         [(x["out_node"], mapping[idx]) for idx, x in df.iterrows()], create_using=graph_type)
    nx.set_edge_attributes(G, {(x["in_node"], mapping[idx]): x["is_fraud"] for idx, x in df.iterrows()}, "label")
    nx.set_edge_attributes(G, {(x["out_node"], mapping[idx]): x["is_fraud"] for idx, x in df.iterrows()}, "label")
    nx.set_edge_attributes(G, {(x["in_node"], mapping[idx]): x["amt"] for idx, x in df.iterrows()}, "weight")
    nx.set_edge_attributes(G, {(x["out_node"], mapping[idx]): x["amt"] for idx, x in df.iterrows()}, "weight")
    return G
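Unlike the bipartite version, build_graph_tripartite also makes a node out of each transaction (each row index), so every transaction node is connected to both its card node and its merchant node, and each row's fraud label and amount are attached to both of those edges.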
def down_sample_textbook(df):
    df_majority = df[df.is_fraud == 0].copy()
    df_minority = df[df.is_fraud == 1].copy()
    df_maj_downsampled = sklearn.utils.resample(df_majority, n_samples=len(df_minority), replace=False, random_state=42)
    df_downsampled = pd.concat([df_minority, df_maj_downsampled])
    return df_downsampled
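A quick check (a sketch; fraudTrain is loaded further below): after downsampling the two classes are equally sized, so the mean label is exactly 0.5.

down_sample_textbook(fraudTrain).is_fraud.mean()  # 0.5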
def embedding(Graph):
    # Graph -> X (edge features)
    _edgs = list(Graph.edges)
    subGraph = Graph.edge_subgraph([_edgs[x] for x in range(len(Graph.edges))]).copy()
    subGraph.add_nodes_from(list(set(Graph.nodes) - set(subGraph.nodes)))
    embedded = AverageEmbedder(Node2Vec(subGraph, weight_key='weight').fit(window=10).wv)
    X = [embedded[str(_edgs[x][0]), str(_edgs[x][1])] for x in range(len(Graph.edges))]
    # Graph -> y (edge labels)
    y = np.array(list(nx.get_edge_attributes(Graph, "label").values()))
    return X, y
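AverageEmbedder represents each edge as the mean of its two endpoint node2vec vectors; HadamardEmbedder, WeightedL1Embedder, and WeightedL2Embedder imported above are drop-in alternatives that combine the same node vectors elementwise.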
def anal(df):
    Graph = build_graph_bipartite(df)
    X, y = embedding(Graph)
    # split the edge embeddings into train/test sets
    X, XX, y, yy = sklearn.model_selection.train_test_split(X, y, random_state=42)
    lrnr = sklearn.ensemble.RandomForestClassifier(n_estimators=100, random_state=42)
    lrnr.fit(X, y)
    yyhat = lrnr.predict(XX)
    df = pd.DataFrame({
        'acc': [sklearn.metrics.accuracy_score(yy, yyhat)],
        'pre': [sklearn.metrics.precision_score(yy, yyhat)],
        'rec': [sklearn.metrics.recall_score(yy, yyhat)],
        'f1': [sklearn.metrics.f1_score(yy, yyhat)]})
    return df
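A sketch of how these helpers chain together (assuming fraudTrain has been loaded as below):

anal(down_sample_textbook(fraudTrain))  # balanced frame -> bipartite graph -> edge embeddings -> random-forest metrics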
def our_sampling1(df):
    cus_list = set(df.query('is_fraud==1').cc_num.tolist())
    return df.query("cc_num in @cus_list")
fraudTrain = pd.read_csv("~/Desktop/fraudTrain.csv").iloc[:, 1:]
fraudTrain = fraudTrain.assign(trans_date_trans_time=list(map(lambda x: pd.to_datetime(x), fraudTrain.trans_date_trans_time)))
fraudTrain
Attempt
_df1 = fraudTrain[fraudTrain["is_fraud"] == 0].sample(frac=0.20, random_state=42)
_df2 = fraudTrain[fraudTrain["is_fraud"] == 1]
df02 = pd.concat([_df1, _df2])
df02.shape
214520*214520  # number of (i, j) pairs if every pair of the 214,520 rows is compared
df02.is_fraud.mean().round(5)
- fraud transaction rate..
df02 = df02.reset_index()
N = len(df02)
train/test split
df02_tr, df02_test = sklearn.model_selection.train_test_split(df02, random_state=42)
df02_tr.is_fraud.mean().round(5), df02_test.is_fraud.mean().round(5)
df02_tr.shape, df02_test.shape
train_mask = [i in df02_tr.index for i in range(N)]
test_mask = [i in df02_test.index for i in range(N)]
np.array(train_mask).sum(), np.array(test_mask).sum()
Data running.. (the df02 loop kills the kernel at this point)!!! Let's try again tomorrow
import time
t1 = time.time()
edge_index_list_plus = []
_cc_num = np.array(df02['cc_num'])
_trans_date_trans_time = np.array(df02['trans_date_trans_time'].apply(lambda x: x.value))
for i in range(N):
    for j in range(N):
        if _cc_num[i] != _cc_num[j]:  # different card numbers: no meaningful time gap
            time_difference = 0
        else:
            time_difference = abs(_trans_date_trans_time[i] - _trans_date_trans_time[j])
        edge_index_list_plus.append([i, j, time_difference])
edge_index_list_plus_nparr = np.array(edge_index_list_plus)
np.save('edge_index_list_plus02.npy', edge_index_list_plus_nparr)
t2 = time.time()
t2 - t1
Data still running…………………. trying again
edge_index = np.array(edge_index_list_plus).astype(np.float64)  # float, so the exponential weights below aren't truncated
edge_index.shape
edge_index
edge_index[:, 2] = np.abs(edge_index[:, 2])
theta = edge_index[:, 2].mean()
theta
edge_index[:, 2] = (np.exp(-edge_index[:, 2]/theta) != 1) * np.exp(-edge_index[:, 2]/theta)  # the != 1 factor zeroes out pairs whose gap is 0 (cross-card and self-pairs), since their raw weight would be exp(0) = 1
edge_index
edge_index_list_updated = edge_index.tolist()
np.array(edge_index_list_updated)[:, 2].mean()
mm = np.array(edge_index_list_updated)[:, 2].mean()
What if the time gap is shorter than average? Hmm..
selected_edges = [(int(row[0]), int(row[1])) for row in edge_index_list_updated if row[2] > mm]
edge_index_selected = torch.tensor(selected_edges, dtype=torch.long).t()
edge_index_selected.shape
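Since the weight is exp(-Δt/θ), a gap equal to the mean gap θ gives weight e⁻¹ ≈ 0.37, and shorter gaps push the weight toward 1; keeping rows with weight above the mean weight mm therefore keeps pairs of same-card transactions that happened unusually close together in time. A quick numeric check:

np.exp(-1.0), np.exp(-0.1)  # ≈ (0.3679, 0.9048): the shorter the gap, the larger the weight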
Data setup (x, edge_index, y)
x = df02['amt']  # node feature: transaction amount
a = torch.tensor(x.values, dtype=torch.float)
a = a.reshape(-1, 1)
a
y = df02['is_fraud']  # node label: fraud or not
b = torch.tensor(y.values, dtype=torch.int64)
b
import torch_geometric
data = torch_geometric.data.Data(x=a, edge_index=edge_index_selected, y=b,
                                 train_mask=torch.tensor(train_mask, dtype=torch.bool),
                                 test_mask=torch.tensor(test_mask, dtype=torch.bool))
data
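Here x is an [N, 1] float tensor (the amount is the only node feature), edge_index is [2, E] over the E selected same-card pairs, and the boolean masks mark which of the N rows fell into df02_tr and df02_test.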
GNN
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GCN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(1, 16)
        self.conv2 = GCNConv(16, 2)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

model = GCN()
model
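The model maps the single input feature through a 16-dimensional hidden layer to 2 output classes; log_softmax here paired with F.nll_loss below is the standard cross-entropy setup.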
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
model.train()
for epoch in range(200):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
model.eval()
pred = model(data).argmax(dim=1)
correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
acc = int(correct) / int(data.test_mask.sum())
print(f'Accuracy: {acc:.4f}')
Accuracy: 0.9321
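Accuracy is easy to over-read on imbalanced fraud data, so it may be worth reporting the same metrics as the random-forest baseline above (a sketch reusing the fitted model):

yy_gnn = data.y[data.test_mask].numpy()
yyhat_gnn = pred[data.test_mask].numpy()
sklearn.metrics.precision_score(yy_gnn, yyhat_gnn), sklearn.metrics.recall_score(yy_gnn, yyhat_gnn), sklearn.metrics.f1_score(yy_gnn, yyhat_gnn)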